In this study we will be using publicly available datasets such as in the curatedMetagenomicData package (shotgun metagenomics) and the MicrobeDS package (for 16S rRNA gene data) to explore the presence of Lactobacilli on the skin. We will also use this data to compare with the other niches available in these datasets. In addition we will also import or own 16S rRNA gene dataset.
Let’s load all necessary packages:
suppressMessages(library(curatedMetagenomicData))
packageVersion("curatedMetagenomicData")
## [1] '1.12.3'
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ readr 1.1.1
## ✔ tibble 1.4.2 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ ggplot2 3.1.0 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ Biobase::combine() masks BiocGenerics::combine(), dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
suppressMessages(library(phyloseq))
packageVersion("phyloseq")
## [1] '1.26.0'
suppressMessages(library(ggpubr))
packageVersion("ggpubr")
## [1] '0.2'
suppressMessages(library(ggtree))
packageVersion("ggtree")
## [1] '1.14.2'
suppressMessages(library(RColorBrewer))
packageVersion("RColorBrewer")
## [1] '1.1.2'
suppressMessages(library(scales))
packageVersion("scales")
## [1] '1.0.0'
Let’s see how many datasets of the curatedMetagenomicData have skin datasets
n_samples <- combined_metadata %>%
group_by(body_site) %>%
summarise(number_of_samples = n())
combined_metadata %>%
group_by(body_site) %>%
summarise(count = n()) %>%
ggplot(aes(x = body_site, fill = body_site, y = count)) +
geom_col() +
geom_text(aes(label = count, y = 5200)) +
scale_fill_brewer(palette = "Dark2") +
coord_flip()
There are around 500 skin samples available!
The easiest format to work with will be the metaphlan bugs. Let’s see how many datasets we can find with this dataformat:
curatedMetagenomicData("*metaphlan_bugs*.skin", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "ChngKR_2016.metaphlan_bugs_list.skin"
## [2] "HMP_2012.metaphlan_bugs_list.skin"
## [3] "OhJ_2014.metaphlan_bugs_list.skin"
## [4] "OlmMR_2017.metaphlan_bugs_list.skin"
## [5] "TettAJ_2016.metaphlan_bugs_list.skin"
These are three different datesets from three different studies:
Let’s download these:
datasets <- curatedMetagenomicData("*metaphlan_bugs*.skin", dryrun = F)
## Working on ChngKR_2016.metaphlan_bugs_list.skin
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1720'
## Working on HMP_2012.metaphlan_bugs_list.skin
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1764'
## Working on OhJ_2014.metaphlan_bugs_list.skin
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1840'
## Working on OlmMR_2017.metaphlan_bugs_list.skin
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1853'
## Working on TettAJ_2016.metaphlan_bugs_list.skin
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1906'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): ChngKR_2016.metaphlan_bugs_list.skin
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1219 features, 78 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: WBE003 WBE004 ... WBU020 (78 total)
## varLabels: subjectID body_site ... NCBI_accession (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27562258
## Annotation:
## [1] "Dataset 2"
## List of length 1
## names(1): HMP_2012.metaphlan_bugs_list.skin
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 27 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRS013258 SRS013261 ... SRS058221 (27 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 22699609
## Annotation:
## [1] "Dataset 3"
## List of length 1
## names(1): OhJ_2014.metaphlan_bugs_list.skin
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 2461 features, 291 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MET0001 MET0002 ... MET0320 (291 total)
## varLabels: subjectID body_site ... NCBI_accession (16 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25279917
## Annotation:
## [1] "Dataset 4"
## List of length 1
## names(1): OlmMR_2017.metaphlan_bugs_list.skin
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 201 features, 4 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: S2_005_022W1 S2_005_012W1 S2_001_023W1 S2_001_006W1
## varLabels: subjectID body_site ... uncurated_metadata (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28073918
## Annotation:
## [1] "Dataset 5"
## List of length 1
## names(1): TettAJ_2016.metaphlan_bugs_list.skin
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1002 features, 97 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SK_CT101OSL_t1M14 SK_CT101RCR_t1M14 ... Steph_b2_9
## (97 total)
## varLabels: subjectID body_site ... curator (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28649415
## Annotation:
And now merge all three of them
mergedatasets <- mergeData(datasets)
mergedatasets
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 3193 features, 497 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: ChngKR_2016.metaphlan_bugs_list.skin:WBE003
## ChngKR_2016.metaphlan_bugs_list.skin:WBE004 ...
## TettAJ_2016.metaphlan_bugs_list.skin:Steph_b2_9 (497 total)
## varLabels: subjectID body_site ... studyID (27 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
Now I’ll convert them to phyloseq and melt them in a big dataframe
skin <- ExpressionSet2phyloseq(mergedatasets)
skin <- psmelt(skin)
Filter dataset to contain strain data only and then filter on the genus Lactobacillus:
skin_t_lactos <- skin %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(skin_t_lactos, "out/Lactobacillus_skin_metagenomes.csv")
How many samples contain lactobacilli?
skin_t_lactos %>%
pull(Sample) %>%
unique() %>%
length()
## [1] 183
How many of these samples have an abundance > 1?
skin_t_lactos %>%
filter(Abundance > 1) %>%
pull(Sample) %>%
unique() %>%
length()
## [1] 29
Let’s plot the top 12 species based on presence absence
toplot <- skin_t_lactos %>%
group_by(Species, studyID) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
top12 <- toplot %>%
arrange(- total_count) %>%
pull(Species) %>%
unique() %>%
.[1:12] %>%
as.character()
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = studyID)) +
geom_col() +
geom_text(aes(label = total_count, y = - 5)) +
coord_flip() +
scale_fill_brewer(palette = "Dark2") +
xlab("") +
ggtitle("Total number of Lactobacilli in skin samples") +
theme_minimal()
And now let’s explore their relative abundance
skin_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
filter(Species %in% top12) %>%
ggplot() +
geom_jitter(aes(x = gender, y = sumabundance, colour = Species, shape = gender), width = 0.3, size = 3) +
scale_color_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative Abundance of Lactobacilli on Skin")
## Warning: Removed 3 rows containing missing values (geom_point).
Is tehre a statistic significant difference?
stats <- skin_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
select(Sample, sumabundance, gender) %>%
distinct %>%
spread(key = gender, value = sumabundance)
wilcox.test(stats$male, stats$female)
##
## Wilcoxon rank sum test with continuity correction
##
## data: stats$male and stats$female
## W = 3404, p-value = 0.08193
## alternative hypothesis: true location shift is not equal to 0
Nope!
But we’d like to know the difference in prevalence between man and women
stat1 <- skin %>%
select(Sample, gender) %>%
distinct() %>%
group_by(gender) %>%
summarise(total = n())
stat1
## # A tibble: 3 x 2
## gender total
## <chr> <int>
## 1 female 197
## 2 male 292
## 3 <NA> 8
And finally, let’s calculate how many samples of all skin samples have lactobacilli in them. We will be using this later to construct Fig1A.
stat_metagenome <- skin %>%
mutate(total_lactos = case_when(Sample %in% (skin %>%
filter(Abundance > 0) %>%
filter(Genus == "Lactobacillus") %>%
pull(Sample) %>%
unique()) ~ "percentage_lacto",
!Sample %in% (skin %>%
filter(Abundance > 0) %>%
filter(Genus == "Lactobacillus") %>%
pull(Sample) %>%
unique()) ~ "non_lacto")) %>%
group_by(total_lactos) %>%
summarise(count = n()/nrow(.)) %>%
mutate(study = "curatedMetagenomicData")
stat_metagenome
## # A tibble: 2 x 3
## total_lactos count study
## <chr> <dbl> <chr>
## 1 non_lacto 0.632 curatedMetagenomicData
## 2 percentage_lacto 0.368 curatedMetagenomicData
Let’s move on to the next niche: oralcavity. There are around 700 samples from the oral cavity available.
Again, let’s use the metaphlan_bugs format.
curatedMetagenomicData("*metaphlan_bugs*.oralcavity", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "BritoIL_2016.metaphlan_bugs_list.oralcavity"
## [2] "Castro-NallarE_2015.metaphlan_bugs_list.oralcavity"
## [3] "HMP_2012.metaphlan_bugs_list.oralcavity"
## [4] "OlmMR_2017.metaphlan_bugs_list.oralcavity"
## [5] "ShiB_2015.metaphlan_bugs_list.oralcavity"
Three studies to be downloaded:
datasets <- curatedMetagenomicData("*metaphlan_bugs*.oralcavity", dryrun = F)
## Working on BritoIL_2016.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1179'
## Working on Castro-NallarE_2015.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1714'
## Working on HMP_2012.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1763'
## Working on OlmMR_2017.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1852'
## Working on ShiB_2015.metaphlan_bugs_list.oralcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1894'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): BritoIL_2016.metaphlan_bugs_list.oralcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1864 features, 140 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: M1.1.SA M1.10.SA ... WL.9.SA (140 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27409808
## Annotation:
## [1] "Dataset 2"
## List of length 1
## names(1): Castro-NallarE_2015.metaphlan_bugs_list.oralcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 755 features, 32 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: ES_001 ES_007 ... ES_211 (32 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26336637
## Annotation:
## [1] "Dataset 3"
## List of length 1
## names(1): HMP_2012.metaphlan_bugs_list.oralcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 415 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRS011086 SRS011090 ... SRS078182 (415 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 22699609
## Annotation:
## [1] "Dataset 4"
## List of length 1
## names(1): OlmMR_2017.metaphlan_bugs_list.oralcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 201 features, 4 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: S2_005_028Y1 S2_005_014Y1 S2_001_023Y1 S2_001_006Y1
## varLabels: subjectID body_site ... uncurated_metadata (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28073918
## Annotation:
## [1] "Dataset 5"
## List of length 1
## names(1): ShiB_2015.metaphlan_bugs_list.oralcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 729 features, 48 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: S01-01-D S01-02-D ... S31-04-R (48 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25691586
## Annotation:
And again, merge these
mergedatasets <- mergeData(datasets)
mergedatasets
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 2483 features, 639 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.1.SA
## BritoIL_2016.metaphlan_bugs_list.oralcavity:M1.10.SA ...
## ShiB_2015.metaphlan_bugs_list.oralcavity:S31-04-R (639 total)
## varLabels: subjectID body_site ... studyID (28 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
oralcavity <- ExpressionSet2phyloseq(mergedatasets)
oralcavity <- psmelt(oralcavity)
That being fixed we can now start the oral cavity analysis!
# filter dataset for strain data only
oralcavity_t_lactos <- oralcavity %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(oralcavity_t_lactos, "out/Lactobacillus_oralcavity_metagenomes.csv")
What Lactobacillus species can be found in the oralcavity?
toplot <- oralcavity_t_lactos %>%
group_by(Species, studyID) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
top12 <- toplot %>%
arrange(- total_count) %>%
pull(Species) %>%
unique() %>%
.[1:12] %>%
as.character()
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = studyID)) +
geom_col() +
geom_text(aes(label = total_count, y = - 5)) +
coord_flip() +
scale_fill_brewer(palette = "Dark2") +
xlab("") +
ggtitle("Total number of Lactobacilli in oralcavity samples") +
theme_minimal()
Wow, okay. The count is way lower in this niche compared to the skin. Let’s look at their abundance
oralcavity_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
filter(Species %in% top12) %>%
ggplot() +
geom_jitter(aes(x = gender, y = sumabundance, colour = Species, shape = gender), width = 0.3, size = 3) +
scale_color_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative Abundance of Lactobacilli in oralcavity")
On to the next niche, the nasal niche!
Again the same story:
curatedMetagenomicData("*metaphlan_bugs*.nasalcavity", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "HMP_2012.metaphlan_bugs_list.nasalcavity"
Let’s download this dataset:
datasets <- curatedMetagenomicData("*metaphlan_bugs*.nasalcavity", dryrun = F)
## Working on HMP_2012.metaphlan_bugs_list.nasalcavity
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1762'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): HMP_2012.metaphlan_bugs_list.nasalcavity
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 93 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRS011105 SRS011132 ... SRS065179 (93 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 22699609
## Annotation:
And merge it
mergedatasets <- mergeData(datasets)
mergedatasets
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 93 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HMP_2012.metaphlan_bugs_list.nasalcavity:SRS011105
## HMP_2012.metaphlan_bugs_list.nasalcavity:SRS011132 ...
## HMP_2012.metaphlan_bugs_list.nasalcavity:SRS065179 (93 total)
## varLabels: subjectID body_site ... studyID (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
And finally convert to phyloseq.
nasalcavity <- ExpressionSet2phyloseq(mergedatasets)
nasalcavity <- psmelt(nasalcavity)
Can we find lactobacilli in the nose?
# filter dataset for strain data only
nasalcavity_t_lactos <- nasalcavity %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(nasalcavity_t_lactos, "out/Lactobacillus_nasalcavity_metagenomes.csv")
Top 12 plot
toplot <- nasalcavity_t_lactos %>%
group_by(Species, studyID) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
top12 <- toplot %>%
arrange(- total_count) %>%
pull(Species) %>%
unique() %>%
.[1:12] %>%
as.character()
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = studyID)) +
geom_col() +
geom_text(aes(label = total_count, y = - 5)) +
coord_flip() +
scale_fill_brewer(palette = "Dark2") +
xlab("") +
ggtitle("Total number of Lactobacilli in nasal samples") +
theme_minimal()
That’s not a lot…
nasalcavity_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
filter(Species %in% top12) %>%
ggplot() +
geom_jitter(aes(x = gender, y = sumabundance, colour = Species, shape = gender), width = 0.3, size = 3) +
scale_color_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative Abundance of Lactobacilli in nasal cavity")
Some of them have a remarkable high relative abundance.
However, due to the very low sample numbers we will not be using them.
What about mother’s milk?
curatedMetagenomicData("*metaphlan_bugs*.milk", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "AsnicarF_2017.metaphlan_bugs_list.milk"
datasets <- curatedMetagenomicData("*metaphlan_bugs*.milk", dryrun = F)
## Working on AsnicarF_2017.metaphlan_bugs_list.milk
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1699'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): AsnicarF_2017.metaphlan_bugs_list.milk
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 799 features, 8 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MV_MIM1_t1M14 MV_MIM2_t1M14 ... MV_MIM5_t3F15 (8
## total)
## varLabels: subjectID body_site ... NCBI_accession (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28144631
## Annotation:
mergedatasets <- mergeData(datasets)
mergedatasets
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 799 features, 8 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames:
## AsnicarF_2017.metaphlan_bugs_list.milk:MV_MIM1_t1M14
## AsnicarF_2017.metaphlan_bugs_list.milk:MV_MIM2_t1M14 ...
## AsnicarF_2017.metaphlan_bugs_list.milk:MV_MIM5_t3F15 (8 total)
## varLabels: subjectID body_site ... studyID (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
milk <- ExpressionSet2phyloseq(mergedatasets)
milk <- psmelt(milk)
Can we find lactobacilli here?
# filter dataset for strain data only
milk_t_lactos <- milk %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(milk_t_lactos, "out/Lactobacillus_milk_metagenomes.csv")
Nope, the dataset is completely empty…
And then now, the vaginal niche! Same story as above.
curatedMetagenomicData("*metaphlan_bugs*vagina", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "HMP_2012.metaphlan_bugs_list.vagina"
Downloading
datasets <- curatedMetagenomicData("*metaphlan_bugs*vagina", dryrun = F)
## Working on HMP_2012.metaphlan_bugs_list.vagina
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1766'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): HMP_2012.metaphlan_bugs_list.vagina
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 67 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRS011111 SRS011269 ... SRS078197 (67 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 22699609
## Annotation:
Merging of datasets
mergedatasets <- mergeData(datasets)
mergedatasets
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 67 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HMP_2012.metaphlan_bugs_list.vagina:SRS011111
## HMP_2012.metaphlan_bugs_list.vagina:SRS011269 ...
## HMP_2012.metaphlan_bugs_list.vagina:SRS078197 (67 total)
## varLabels: subjectID body_site ... studyID (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
Let’s convert to phyloseq.
vagina <- ExpressionSet2phyloseq(mergedatasets)
vagina <- psmelt(vagina)
How many lactobacillus species can we find?
# filter dataset for strain data only
vagina_t_lactos <- vagina %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(vagina_t_lactos, "out/Lactobacillus_vagina_metagenomes.csv")
Top12 species plot:
toplot <- vagina_t_lactos %>%
group_by(Species, studyID) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
top12 <- toplot %>%
arrange(- total_count) %>%
pull(Species) %>%
unique() %>%
.[1:12] %>%
as.character()
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = studyID)) +
geom_col() +
geom_text(aes(label = total_count, y = - 5)) +
coord_flip() +
scale_fill_brewer(palette = "Dark2") +
xlab("") +
ggtitle("Total number of Lactobacilli in vaginal samples") +
theme_minimal()
Relative abundance
vagina_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
filter(Species %in% top12) %>%
ggplot() +
geom_jitter(aes(x = gender, y = sumabundance, colour = Species, shape = gender), width = 0.3, size = 3) +
scale_color_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative Abundance of Lactobacilli in vagina")
As expected, this is a lot higher!
The final niche will be the human stool niche. Sorry for the repetition, I know it’s probably getting boring.
curatedMetagenomicData("*metaphlan_bugs*stool", dryrun = T)
## Dry run: see return values for datasets that would be downloaded. Run with `dryrun=FALSE` to actually download these datasets.
## [1] "AsnicarF_2017.metaphlan_bugs_list.stool"
## [2] "Bengtsson-PalmeJ_2015.metaphlan_bugs_list.stool"
## [3] "BritoIL_2016.metaphlan_bugs_list.stool"
## [4] "DavidLA_2015.metaphlan_bugs_list.stool"
## [5] "FengQ_2015.metaphlan_bugs_list.stool"
## [6] "HanniganGD_2017.metaphlan_bugs_list.stool"
## [7] "Heitz-BuschartA_2016.metaphlan_bugs_list.stool"
## [8] "HMP_2012.metaphlan_bugs_list.stool"
## [9] "KarlssonFH_2013.metaphlan_bugs_list.stool"
## [10] "KosticAD_2015.metaphlan_bugs_list.stool"
## [11] "LeChatelierE_2013.metaphlan_bugs_list.stool"
## [12] "LiJ_2014.metaphlan_bugs_list.stool"
## [13] "LiJ_2017.metaphlan_bugs_list.stool"
## [14] "LiSS_2016.metaphlan_bugs_list.stool"
## [15] "LiuW_2016.metaphlan_bugs_list.stool"
## [16] "LomanNJ_2013.metaphlan_bugs_list.stool"
## [17] "LoombaR_2017.metaphlan_bugs_list.stool"
## [18] "LouisS_2016.metaphlan_bugs_list.stool"
## [19] "NielsenHB_2014.metaphlan_bugs_list.stool"
## [20] "Obregon-TitoAJ_2015.metaphlan_bugs_list.stool"
## [21] "OlmMR_2017.metaphlan_bugs_list.stool"
## [22] "PasolliE_2018.metaphlan_bugs_list.stool"
## [23] "QinJ_2012.metaphlan_bugs_list.stool"
## [24] "QinN_2014.metaphlan_bugs_list.stool"
## [25] "RampelliS_2015.metaphlan_bugs_list.stool"
## [26] "RaymondF_2016.metaphlan_bugs_list.stool"
## [27] "SchirmerM_2016.metaphlan_bugs_list.stool"
## [28] "SmitsSA_2017.metaphlan_bugs_list.stool"
## [29] "ThomasAM_2018a.metaphlan_bugs_list.stool"
## [30] "VatanenT_2016.metaphlan_bugs_list.stool"
## [31] "VincentC_2016.metaphlan_bugs_list.stool"
## [32] "VogtmannE_2016.metaphlan_bugs_list.stool"
## [33] "WenC_2017.metaphlan_bugs_list.stool"
## [34] "XieH_2016.metaphlan_bugs_list.stool"
## [35] "YuJ_2015.metaphlan_bugs_list.stool"
## [36] "ZellerG_2014.metaphlan_bugs_list.stool"
Damn! That’s way more data than all other datasets! Here I will be downloading them:
datasets <- curatedMetagenomicData("*metaphlan_bugs*stool", dryrun = F)
## Working on AsnicarF_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1700'
## Working on Bengtsson-PalmeJ_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1708'
## Working on BritoIL_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1180'
## Working on DavidLA_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1726'
## Working on FengQ_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1732'
## Working on HanniganGD_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1738'
## Working on Heitz-BuschartA_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1744'
## Working on HMP_2012.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1765'
## Working on KarlssonFH_2013.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1780'
## Working on KosticAD_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1786'
## Working on LeChatelierE_2013.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1254'
## Working on LiJ_2014.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1792'
## Working on LiJ_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1798'
## Working on LiSS_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1804'
## Working on LiuW_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1810'
## Working on LomanNJ_2013.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1816'
## Working on LoombaR_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1822'
## Working on LouisS_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1828'
## Working on NielsenHB_2014.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1290'
## Working on Obregon-TitoAJ_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1834'
## Working on OlmMR_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1854'
## Working on PasolliE_2018.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1864'
## Working on QinJ_2012.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1308'
## Working on QinN_2014.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1870'
## Working on RampelliS_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1876'
## Working on RaymondF_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1882'
## Working on SchirmerM_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1888'
## Working on SmitsSA_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1900'
## Working on ThomasAM_2018a.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1912'
## Working on VatanenT_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1918'
## Working on VincentC_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1924'
## Working on VogtmannE_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1930'
## Working on WenC_2017.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1936'
## Working on XieH_2016.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1942'
## Working on YuJ_2015.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1948'
## Working on ZellerG_2014.metaphlan_bugs_list.stool
## snapshotDate(): 2018-10-30
## see ?curatedMetagenomicData and browseVignettes('curatedMetagenomicData') for documentation
## downloading 0 resources
## loading from cache
## '/home/swuyts//.ExperimentHub/1954'
for (i in 1:length(datasets)){
print(str_c("Dataset ",i))
print(datasets[i])
print(datasets[[i]])
}
## [1] "Dataset 1"
## List of length 1
## names(1): AsnicarF_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 799 features, 16 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MV_FEI1_t1Q14 MV_FEI2_t1Q14 ... MV_FEM5_t3Q15 (16
## total)
## varLabels: subjectID body_site ... NCBI_accession (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28144631
## Annotation:
## [1] "Dataset 2"
## List of length 1
## names(1): Bengtsson-PalmeJ_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1220 features, 70 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: TRAVELRES13 TRAVELRES35 ... TRAVELRES52 (70 total)
## varLabels: subjectID body_site ... NCBI_accession (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26259788
## Annotation:
## [1] "Dataset 3"
## List of length 1
## names(1): BritoIL_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1864 features, 172 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: M1.1.ST M1.10.ST ... WL.9.ST (172 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27409808
## Annotation:
## [1] "Dataset 4"
## List of length 1
## names(1): DavidLA_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 892 features, 49 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: LD-Run2-37 LD-Run2-36 ... LD-Run1-01 (49 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25991682
## Annotation:
## [1] "Dataset 5"
## List of length 1
## names(1): FengQ_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1547 features, 154 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SID31004 SID31009 ... SID532915 (154 total)
## varLabels: subjectID body_site ... NCBI_accession (24 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25758642
## Annotation:
## [1] "Dataset 6"
## List of length 1
## names(1): HanniganGD_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 716 features, 82 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MG100208 MG100207 ... MG100125 (82 total)
## varLabels: subjectID body_site ... NCBI_accession (15 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
## [1] "Dataset 7"
## List of length 1
## names(1): Heitz-BuschartA_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1011 features, 53 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: M01.1-V1-stool M01.1-V2-stool ... M04.6-V3-stool
## (53 total)
## varLabels: subjectID body_site ... NCBI_accession (24 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27723761
## Annotation:
## [1] "Dataset 8"
## List of length 1
## names(1): HMP_2012.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1988 features, 147 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRS011061 SRS011084 ... SRS078176 (147 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 22699609
## Annotation:
## [1] "Dataset 9"
## List of length 1
## names(1): KarlssonFH_2013.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1140 features, 145 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: S112 S118 ... S92 (145 total)
## varLabels: subjectID body_site ... NCBI_accession (34 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 23719380
## Annotation:
## [1] "Dataset 10"
## List of length 1
## names(1): KosticAD_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 869 features, 124 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: PRJNA231909.3105829 PRJNA231909.3108874 ...
## PRJNA231909.3114185 (124 total)
## varLabels: subjectID study_condition ... NCBI_accession (26
## total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25662751
## Annotation:
## [1] "Dataset 11"
## List of length 1
## names(1): LeChatelierE_2013.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1542 features, 292 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MH0001 MH0002 ... MH0462 (292 total)
## varLabels: subjectID body_site ... NCBI_accession (15 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 23985870
## Annotation:
## [1] "Dataset 12"
## List of length 1
## names(1): LiJ_2014.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1613 features, 260 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: DLF005-IE DLM006-IE ... V1.UC64-0 (260 total)
## varLabels: subjectID body_site ... NCBI_accession (15 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 24997786
## Annotation:
## [1] "Dataset 13"
## List of length 1
## names(1): LiJ_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1150 features, 196 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: nHF511704 nHM512705 ... H2M514107 (196 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28143587
## Annotation:
## [1] "Dataset 14"
## List of length 1
## names(1): LiSS_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 854 features, 55 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: FAT_DON_19-22-0-0 FAT_020-22-0-0 ...
## FAT_012-22-84-0 (55 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27126044
## Annotation:
## [1] "Dataset 15"
## List of length 1
## names(1): LiuW_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1078 features, 110 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SRR3992955 SRR3992956 ... SRR3993064 (110 total)
## varLabels: subjectID body_site ... NCBI_accession (15 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27708392
## Annotation:
## [1] "Dataset 16"
## List of length 1
## names(1): LomanNJ_2013.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 736 features, 43 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: OBK1122 OBK1196 ... OBK5066 (43 total)
## varLabels: subjectID body_site ... NCBI_accession (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 23571589
## Annotation:
## [1] "Dataset 17"
## List of length 1
## names(1): LoombaR_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1034 features, 86 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SID0002_gti SID0004_cle ... SID5640_ijm (86 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28467925
## Annotation:
## [1] "Dataset 18"
## List of length 1
## names(1): LouisS_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 657 features, 92 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: AS43_12 AS43_18 ... AS68_6 (92 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26919743
## Annotation:
## [1] "Dataset 19"
## List of length 1
## names(1): NielsenHB_2014.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1939 features, 396 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MH0001 MH0002 ... V1_UC9_0 (396 total)
## varLabels: subjectID body_site ... NCBI_accession (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 24997787
## Annotation:
## [1] "Dataset 20"
## List of length 1
## names(1): Obregon-TitoAJ_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1094 features, 58 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HCO02 HCO07 ... SM44 (58 total)
## varLabels: subjectID body_site ... NCBI_accession (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25807110
## Annotation:
## [1] "Dataset 21"
## List of length 1
## names(1): OlmMR_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 201 features, 37 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: S2_001_017G1 S2_001_016G1 ... S2_001_018G1 (37
## total)
## varLabels: subjectID body_site ... uncurated_metadata (20 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28073918
## Annotation:
## [1] "Dataset 22"
## List of length 1
## names(1): PasolliE_2018.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1111 features, 112 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: A01_02_1FE A02_01_1FE ... V54_01_1FE (112 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
## [1] "Dataset 23"
## List of length 1
## names(1): QinJ_2012.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1588 features, 363 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: CON-001 CON-002 ... T2D-107 (363 total)
## varLabels: subjectID body_site ... NCBI_accession (21 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 23023125
## Annotation:
## [1] "Dataset 24"
## List of length 1
## names(1): QinN_2014.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1512 features, 237 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: HD-1 HD-10 ... LV-9 (237 total)
## varLabels: subjectID body_site ... NCBI_accession (25 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25079328
## Annotation:
## [1] "Dataset 25"
## List of length 1
## names(1): RampelliS_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 727 features, 38 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: H1 H10 ... IT8 (38 total)
## varLabels: subjectID body_site ... NCBI_accession (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25981789
## Annotation:
## [1] "Dataset 26"
## List of length 1
## names(1): RaymondF_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 834 features, 72 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: P10E0 P10E7 ... P9E90 (72 total)
## varLabels: subjectID body_site ... NCBI_accession (23 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26359913
## Annotation:
## [1] "Dataset 27"
## List of length 1
## names(1): SchirmerM_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1177 features, 471 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: G88690 G88691 ... G90578 (471 total)
## varLabels: subjectID body_site ... NCBI_accession (23 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27984736
## Annotation:
## [1] "Dataset 28"
## List of length 1
## names(1): SmitsSA_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 343 features, 40 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: TZ_81781 TZ_42864 ... TZ_39227 (40 total)
## varLabels: subjectID antibiotics_current_use ... NCBI_accession
## (17 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28839072
## Annotation:
## [1] "Dataset 29"
## List of length 1
## names(1): ThomasAM_2018a.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1162 features, 80 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: LILT_GF06_16 LILT_GF12_16 ... LILT_VF82_T016 (80
## total)
## varLabels: subjectID body_site ... NCBI_accession (22 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## Annotation:
## [1] "Dataset 30"
## List of length 1
## names(1): VatanenT_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1584 features, 785 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: G69146 G69147 ... G80624 (785 total)
## varLabels: subjectID body_site ... NCBI_accession (24 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27259157
## Annotation:
## [1] "Dataset 31"
## List of length 1
## names(1): VincentC_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1452 features, 229 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MM001.1 MM001.2 ... MM104 (229 total)
## varLabels: subjectID body_site ... NCBI_accession (19 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26975510
## Annotation:
## [1] "Dataset 32"
## List of length 1
## names(1): VogtmannE_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1296 features, 110 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: MMRS11288076ST-27-0-0 MMRS11664448ST-27-0-0 ...
## MMRS99045180ST-27-0-0 (110 total)
## varLabels: subjectID body_site ... NCBI_accession (18 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27171425
## Annotation:
## [1] "Dataset 33"
## List of length 1
## names(1): WenC_2017.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1075 features, 97 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: AS2raw AS3raw ... AS130raw (97 total)
## varLabels: subjectID body_site ... BMI (30 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 28750650
## Annotation:
## [1] "Dataset 34"
## List of length 1
## names(1): XieH_2016.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1551 features, 250 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: YSZC12003_35365 YSZC12003_35366 ... YSZC12003_37880
## (250 total)
## varLabels: subjectID body_site ... NCBI_accession (31 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 27818083
## Annotation:
## [1] "Dataset 35"
## List of length 1
## names(1): YuJ_2015.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1405 features, 128 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: SZAXPI003409-8 SZAXPI003410-3 ... SZAXPI017595-169
## (128 total)
## varLabels: subjectID body_site ... NCBI_accession (16 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 26408641
## Annotation:
## [1] "Dataset 36"
## List of length 1
## names(1): ZellerG_2014.metaphlan_bugs_list.stool
## ExpressionSet (storageMode: lockedEnvironment)
## assayData: 1585 features, 199 samples
## element names: exprs
## protocolData: none
## phenoData
## sampleNames: CCIS00146684ST-4-0 CCIS00281083ST-3-0 ...
## MMPU99077057ST (199 total)
## varLabels: subjectID body_site ... NCBI_accession (22 total)
## varMetadata: labelDescription
## featureData: none
## experimentData: use 'experimentData(object)'
## pubMedIds: 25432777
## Annotation:
As the size of these datasets is much higher than the others, I will loop over 2 datasets at one time and filter them for Lactobacilli. Only the samples that have Lactobacillus in them will be kept. In the loop I will also do the conversion to phyloseq and the large dataframe.
stool <- tibble()
for (i in 1:(length(datasets)-1)){
stool_temp <- mergeData(datasets[i:(i+1)]) %>%
ExpressionSet2phyloseq() %>%
psmelt() %>%filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
stool <- stool %>%
bind_rows(stool_temp)
}
saveRDS(stool, 'in/stool.rds')
To avoid doing this hard task everytime again, I will just read in a previously saved rds file.
stool <- readRDS("in/stool.rds")
Ready for analysis!
# filter dataset for strain data only
stool_t_lactos <- stool %>%
filter(str_detect(OTU, "t__")) %>%
filter(Genus == "Lactobacillus") %>%
filter(Abundance > 0) %>%
replace_na(list(disease = "Unkown"))
write.csv(stool_t_lactos, "out/Lactobacillus_stool_metagenomes.csv")
Top 12 species
toplot <- stool_t_lactos %>%
group_by(Species, studyID) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
top12 <- toplot %>%
arrange(- total_count) %>%
pull(Species) %>%
unique() %>%
.[1:12] %>%
as.character()
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = studyID)) +
geom_col() +
geom_text(aes(label = total_count, y = - 5)) +
coord_flip() +
xlab("") +
ggtitle("Total number of Lactobacilli in stool samples") +
theme_minimal() +
theme(legend.position = "bottom")
Wooooow! This is something completely different!
stool_t_lactos %>%
group_by(Sample) %>%
mutate(sumabundance = sum(Abundance)) %>%
filter(Species %in% top12) %>%
ggplot() +
geom_jitter(aes(x = gender, y = Abundance, colour = Species, shape = gender), width = 0.3, size = 3) +
scale_color_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative Abundance of Lactobacilli in stool")
## Warning: Removed 2152 rows containing missing values (geom_point).
Now that we went through all niches, let’s combine these into one big file and plot!
toplot <- oralcavity_t_lactos %>%
bind_rows(nasalcavity_t_lactos) %>%
bind_rows(skin_t_lactos) %>%
bind_rows(stool_t_lactos) %>%
bind_rows(vagina_t_lactos) %>%
group_by(Species, body_site) %>%
summarise(count = n()) %>%
group_by(Species) %>%
mutate(total_count = sum(count))
toplot %>%
ggplot(aes(x = reorder(Species, - total_count), y = count, fill = body_site)) +
geom_col() +
coord_flip() +
scale_fill_brewer(palette = "Dark2") +
xlab("") +
ggtitle("Total number of Lactobacilli per body site") +
theme_minimal() +
facet_grid(~body_site, scales = 'free_x', space = "free_y")
Cool stuff! Let’s do the same for abundance
oralcavity_t_lactos %>%
bind_rows(nasalcavity_t_lactos) %>%
bind_rows(skin_t_lactos) %>%
bind_rows(stool_t_lactos) %>%
bind_rows(vagina_t_lactos) %>%
# filter(Abundance > 1) %>%
ggplot(aes(x = body_site, y = Abundance, fill = body_site)) +
geom_boxplot(outlier.alpha = 0) +
geom_jitter(colour = 'black', shape = 21, size = 0.7, alpha = 0.5, width = 0.2) +
scale_fill_brewer(palette = "Paired") +
theme_bw() +
ggtitle("Relative abundance of Lactobacilli in all body sites")
We’ll also be using our own dataset, namely skin samples collected from cheeck swabs. For this I will read in a csv that my colleague prepared.
partial_16S <- read_csv2("in/baselineCon_ASVlevel.csv", col_types = cols(gender = col_character()))
## Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
## Warning: Missing column names filled in: 'X1' [1]
How many samples do we have in this dataset?
stat1 <- partial_16S %>%
select(sample, gender) %>%
distinct %>%
group_by(gender) %>%
summarise(total = n())
stat1
## # A tibble: 2 x 2
## gender total
## <chr> <int>
## 1 F 15
## 2 M 14
How many of these sample contain lactobacilli? Let’s save this for plotting later in Figure 1A
stat_16S <- partial_16S %>%
mutate(total_lactos = case_when(sample %in% (partial_16S %>%
filter(rel_abundance > 0) %>%
filter(genus == "Lactobacillus") %>%
pull(sample) %>%
unique()) ~ "percentage_lacto",
!sample %in% (partial_16S %>%
filter(rel_abundance > 0) %>%
filter(genus == "Lactobacillus") %>%
pull(sample) %>%
unique()) ~ "non_lacto")) %>%
group_by(total_lactos) %>%
summarise(count = n()/nrow(.)) %>%
mutate(study = "This study")
stat_16S
## # A tibble: 2 x 3
## total_lactos count study
## <chr> <dbl> <chr>
## 1 non_lacto 0.0818 This study
## 2 percentage_lacto 0.918 This study
High percentage of lactos!
Now we will add the V35 dataset from the Human Microbiome Project (HMP) with this usefull package: MicrobeDs (https://github.com/twbattaglia/MicrobeDS). They made the dataset available Phyloseq ready!
# Load Library
library(MicrobeDS)
# Load selected datasets
data('HMPv35')
# Calculate relative abundance
HMPv35_relabun <- transform_sample_counts(HMPv35, function(x) x / sum(x))
# Check number of samples
nsamples(HMPv35_relabun)
## [1] 6000
That’s a nice addition to our analysis!
Next, we’ll be filtering to keep only Lactobacillus OTUs.
# Subset on lactos and use this for visualisation
HMPv35_lactos <- subset_taxa(HMPv35_relabun, Genus == "Lactobacillus") %>%
psmelt() %>%
filter(Abundance > 0)
Now let’s subset the original dataset on Skin as well and then annotate calculate how many samples contain lactos.
# Subset on skin
HMPv35_skin <- subset_samples(HMPv35_relabun, env == "Skin")
HMPv35_skin_df <- data.frame(sample_data(HMPv35_skin)) %>%
rownames_to_column(var = "Sample")
HMP_stats <- HMPv35_skin_df %>%
mutate(total_lactos = case_when(Sample %in% (HMPv35_lactos %>% pull(Sample) %>% unique()) ~ "percentage_lacto",
!Sample %in% (HMPv35_lactos %>% pull(Sample) %>% unique()) ~ "non_lacto")) %>%
group_by(total_lactos) %>%
summarise(count = n()/nrow(.)) %>%
mutate(study = "HMPv35")
Now we’ve got all necessary data to start constructing our figure. Let’s start with Figure 1A.
Fig1A <- stat_16S %>%
bind_rows(stat_metagenome) %>%
bind_rows(HMP_stats) %>%
mutate(total_lactos = case_when(total_lactos == "non_lacto" ~ "No Lactobacilli detected",
total_lactos == "percentage_lacto" ~ "Lactobacilli detected")) %>%
mutate(total_lactos = factor(total_lactos, levels = c("No Lactobacilli detected",
"Lactobacilli detected"))) %>%
mutate(study = factor(study, levels = c("This study", "HMPv35", "curatedMetagenomicData"))) %>%
mutate(count = count*100) %>%
ggplot(aes(x = study, y = count, fill = total_lactos)) +
geom_bar(stat = "identity") +
scale_y_continuous(breaks = c(0, 50, 100)) +
scale_fill_brewer(palette = "Paired") +
ylab("\n Percentage of samples (%)") +
theme_minimal() +
theme(strip.text.x = element_text(face = "bold", size = 14),
axis.text.x = element_text(size = 8, angle = 45, vjust = 1, hjust = 1, face = "bold"),
axis.text.y = element_text(face = "bold", size = 10),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 6),
axis.ticks = element_line(),
legend.title = element_blank(),
panel.grid = element_blank(),
legend.direction = "vertical",
legend.position = "right",
legend.justification = "left",
legend.margin = margin(0,0,0,0),
legend.text = element_text(size = 6),
legend.key.size = unit(6, "pt")) +
ggtitle("")
Fig1A
This figure shows the percentage of skin microbiome samples that have lactobacilli in them from the three different studies.
In Fig1B we will be looking at the relative abundances. For this graph we will be summing all Lactobacillus relative abundances per sample. We thus get one value per sample per body niche.
First we’ll do this for the HMP 16S dataset.
HMP_plot <- HMPv35_lactos %>%
distinct() %>%
group_by(Sample, env) %>%
summarise(summedAbundance = sum(Abundance)*100) %>%
ungroup() %>%
add_count(env) %>%
mutate(study = "HMPv35") %>%
rename(body_site = env) %>%
mutate(body_site = as.character(body_site)) %>%
filter(summedAbundance > 0)
Now also for the above described datasets from the curatedMetagenomicData package, except for the nasal cavity as not a lot of lacto samples were detected there.
curatedMetagenomicData <- oralcavity_t_lactos %>%
# bind_rows(nasalcavity_t_lactos) %>% # Nasal cavity is filtered out because only 3 samples contain lactos
bind_rows(skin_t_lactos) %>%
bind_rows(stool_t_lactos) %>%
bind_rows(vagina_t_lactos) %>%
filter(Abundance > 0) %>%
distinct() %>%
group_by(Sample, body_site) %>%
summarise(summedAbundance = sum(Abundance)) %>%
ungroup() %>%
add_count(body_site) %>%
mutate(study = "curatedMetagenomicData") %>%
mutate(body_site = case_when(body_site == "stool" ~ "Stool",
body_site == "skin" ~ "Skin",
body_site == "vagina" ~ "Vaginal",
# body_site == "nasalcavity" ~ "Nasal",
body_site == "oralcavity" ~ "Oral"))
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
And finally also for our own dataset.
b_16S <- partial_16S %>%
filter(genus == "Lactobacillus") %>%
group_by(sample, gender) %>%
summarise(summedAbundance = sum(rel_abundance)*100) %>%
mutate(study = "This study") %>%
mutate(body_site = "Skin") %>%
mutate(n = nrow(.))
Before we start constructing our Fig 1B we also need to do some counting for the labels. Then we’ll go on with constructing Figure 1B
fig1B_count <- curatedMetagenomicData %>%
bind_rows(HMP_plot) %>%
bind_rows(b_16S) %>%
select(body_site, n, study) %>%
distinct() %>%
mutate(study = factor(study, levels = c("This study", "HMPv35", "curatedMetagenomicData")))
label_format <- format_format(big.mark = " ", decimal.mark = ".", scientific = FALSE)
Fig1B <- curatedMetagenomicData %>%
bind_rows(HMP_plot) %>%
bind_rows(b_16S) %>%
mutate(study = factor(study, levels = c("This study", "HMPv35", "curatedMetagenomicData"))) %>%
mutate(body_site = factor(body_site, levels = c("Skin", "Oral", "Stool", "Vaginal"))) %>%
ggplot(aes(x = body_site, y = summedAbundance, fill = study)) +
geom_boxplot(outlier.alpha = 0, alpha = 0.4) +
geom_point(aes(colour = study), position = position_jitterdodge(jitter.height = 0, jitter.width = 0.1),
shape = 21, size = 0.7, alpha = 0.2) +
geom_text(data = fig1B_count, aes(y = 250, label = str_c("n = ", n), group = study), position = position_dodge(width = 0.8), size = 2, angle = 45, hjust= 0, vjust= 0) +
scale_fill_brewer(palette = "Dark2") +
scale_colour_brewer(palette = "Dark2") +
expand_limits(y = 8000) +
scale_y_log10(labels = label_format, breaks = c(0.01,1,100)) +
ylab("\n Relative abundance (%)") +
theme_minimal() +
theme(strip.text.x = element_text(face = "bold", size = 14),
axis.text.x = element_text(size = 8, angle = 45, hjust = 1, vjust =1, face = "bold"),
axis.text.y = element_text(face = "bold", size = 12),
axis.title.x = element_blank(),
axis.title.y = element_text(size = 8),
axis.ticks = element_line(),
legend.title = element_blank(),
legend.position = 'right',
panel.grid.major.y = element_line(),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank()
) +
ggtitle("")
Fig1B
We need some information from 1E to colour figure 1C. Therefore we will be plotting 1E first.
In this figure we will be counting the top 12 Lactobacillus species from the shotgun metagenomic dataset.
curatedMetagenomicData_top12_species <- skin_t_lactos %>%
group_by(Species) %>%
summarise(count = n()) %>%
separate(Species, into = c('genus', 'species'), sep ="_", extra = "merge") %>%
unite(Species, c("genus", "species"), sep = " ") %>%
arrange(-count) %>%
.[1:12,]
# Create table that connects species to the phylogenetic group they belong to
species_to_phylogenetic_group <- tibble(Species = c("Lactobacillus crispatus",
"Lactobacillus iners",
"Lactobacillus gasseri",
"Lactobacillus jensenii",
"Lactobacillus plantarum",
"Lactobacillus delbrueckii",
"Lactobacillus fermentum",
"Lactobacillus casei_paracasei",
"Lactobacillus sakei",
"Lactobacillus vaginalis",
"Lactobacillus rhamnosus",
"Lactobacillus curvatus"),
clade_name = c("L. delbrueckii group",
"L. delbrueckii group",
"L. delbrueckii group",
"L. delbrueckii group",
"L. plantarum group",
"L. delbrueckii group",
"L. reuteri group",
"L. casei group",
"L. sakei group",
"L. reuteri group",
"L. casei group",
"L. sakei group"))
# Add this data to the dataset
curatedMetagenomicData_top12_species <- curatedMetagenomicData_top12_species %>%
left_join(species_to_phylogenetic_group)
## Joining, by = "Species"
Fig1E <- curatedMetagenomicData_top12_species %>%
ggplot(aes(x = reorder(Species, count), y = count, fill = clade_name)) +
geom_col() +
coord_flip() +
ylab("Count") +
xlab("") +
theme(
# Set text size
axis.title.y = element_text(size = 1,
margin = margin(0,0,0,0)),
axis.title.x = element_text(size = 10,
margin = margin(5,0,0,0)),
axis.text.x = element_text(size = 6,
colour="black",
margin = margin(2,0,0,0)),
axis.text.y = element_text(size = 6,
colour="black",
margin = margin(0,0,0,0),
face = "italic"),
# Configure lines and axes
axis.ticks.x = element_line(colour = "black"),
axis.ticks.y = element_blank(),
# Plot background
panel.background = element_rect(fill = "white"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# Remove border
panel.border = element_rect(fill="transparent",color=NA),
# Remove legend
legend.position = "right",
legend.title = element_blank()
)
Fig1E
We’ll fix the colours later
Time to do the phylogenetic placement. This is done in a separate script, but here we will be writing the ASVs to a file that we will use as input for the placement.
partial_16S %>%
filter(genus == "Lactobacillus") %>%
select(taxon, family, genus, taxon_name) %>%
distinct() %>%
write_tsv("phylogenetic_placement/input/ASVs.tsv")
Now time for the phylogenetic placement! This is probably the hardest part to understand from this whole analysis as we need to do a lot of parsing. But basically we will be reading in a jplace object (which contains oru phylogenetic placements and our 16S rRNA tree) and convert this to a large dataframe which we called tidytree. This makes data handling and mutating much easier. We then manipulate this object to figure out to be able to annotate clades and colour the branches that have a sequenced placed on them.
# Original function by S. Wittouck and adapted by S. Wuyts.
jplace2tidytrees <- function(jplace, outgroup_tips) {
# Get the edgeNumber of each node and store them under edge.length
jplace@phylo$edge.length <- tibble(node = jplace@phylo$edge[, 2]) %>%
left_join(attr(jplace@phylo, "edgeNum")) %>%
pull(edgeNum)
# Identify the MRCA of the outgroup
outgroup_node <- MRCA(jplace@phylo, tip = outgroup_tips)
# Reroot the tree baced on this MRCA
jplace@phylo <- reroot(jplace@phylo, outgroup_node)
# Initiate new nodes table
nodes <- tibble()
for (taxon_index in 1:length(jplace@placements$name)) {
# Get taxon name of placement
taxon_name <- jplace@placements$name[[taxon_index]]
# Get ggtree table
nodes_new <- ggtree(jplace@phylo, branch.length = "none") %>%
.$data %>%
rename(edge_num = branch.length) # Retrieve the edgeNumber that we stored in the branched length above
# Get the placements in a table
placements <- jplace@placements %>%
as_tibble %>%
filter(name == taxon_name) %>%
select(-node)
nodes_new <- left_join(nodes_new, placements) %>%
mutate(placement = ifelse(is.na(like_weight_ratio), "no", "yes")) %>%
mutate(asv = !! taxon_name)
nodes <- bind_rows(nodes, nodes_new)
}
list(tidytrees = nodes, tree = jplace@phylo)
}
# Read in the phylogenetic placement data
jplace <- read.jplace(file = "phylogenetic_placement/data/tree_placement/RAxML_portableTree.placement.jplace")
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
outgroup_tips <- c(
"GCA_001437585.1_1",
"GCA_001438885.1_1",
"GCA_001437015.1_1",
"GCA_001437015.1_2"
)
out <- jplace2tidytrees(jplace, outgroup_tips)
## Joining, by = "node"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Loading required package: phytools
## Loading required package: ape
##
## Attaching package: 'ape'
## The following object is masked from 'package:ggtree':
##
## rotate
## The following object is masked from 'package:ggpubr':
##
## rotate
## Loading required package: maps
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
##
## Attaching package: 'phytools'
## The following objects are masked from 'package:ggtree':
##
## read.newick, reroot
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Joining, by = "edge_num"
tidytrees <- out$tidytrees
tree <- out$tree
# Read in species name for all used genomes
genomes <- read_tsv(file = "phylogenetic_placement/data/genomes.tsv")
## Parsed with column specification:
## cols(
## assembly_accession = col_character(),
## species = col_character(),
## strain = col_character()
## )
# Figure out the best nodes per placed ASV.
asvs_best_node <- tidytrees %>%
group_by(asv) %>%
filter(like_weight_ratio == max(like_weight_ratio, na.rm = T)) %>%
slice(1) %>%
ungroup() %>%
select(asv, best_node = node)
# Optimise tidytree
tidytree <- tidytrees %>%
group_by(node, parent, edge_num, x, y, label, isTip, branch, angle) %>%
summarize(placement = sum(! is.na(like_weight_ratio)) > 0) %>%
mutate(placement = ifelse(placement, "yes", "no")) %>%
mutate(best_placement = ifelse(node %in% asvs_best_node$best_node, "yes", "no")) %>%
mutate(assembly_accession = str_extract(label, "[A-Z]+_[0-9]+\\.[0-9]")) %>%
left_join(genomes) %>%
mutate(species = str_replace_all(species, "_", " ")) %>%
mutate(species = str_extract(species, "[^ ]+ [^ ]+"))
## Joining, by = "assembly_accession"
# find the node numbers of clade MRCAs
tidytree %>%
ggtree(layout = "circular", aes(col = placement), size = 0.5, branch.length = "none") +
geom_tiplab2(aes(label = node), col = "#000000", size = 3) +
scale_color_manual(values = c(no = "#D0D0D0", yes = "#d95f02")) +
theme(plot.title = element_text(hjust = 0.5))
# Define clades based on these numbers
clades <- tibble(
tip1 = c(1, 120, 153, 166, 177),
tip2 = c(23, 115, 165, 170, 177),
clade_name = c(
"L. delbrueckii group",
"L. plantarum group",
"L. casei group",
"L. sakei group",
"L. algidus group"
)
) %>%
mutate(mrca = map2_int(tip1, tip2, function(x, y) MRCA(tree, c(x, y)))) %>%
mutate(col = brewer.pal(nrow(.), name = "Paired") %>% as.character())
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
## Found more than one class "phylo" in cache; using the first, from namespace 'phyloseq'
## Also defined by 'tidytree'
nodes_clades <- clades %>%
group_by(mrca, clade_name) %>%
do({
tibble(node = c(getDescendants(tree = tree, node = .$mrca), .$mrca))
}) %>%
ungroup() %>%
rename(clade = mrca)
colors <- clades$col %>%
set_names(clades$clade_name %>% as.character()) %>%
c(., "other" = "#D0D0D0") %>%
c(., "unclassified" = "#000000") %>%
c(., "L. reuteri group" = "#E31A1C")
# This is necessary for colouring the tips according to the curateddMetagenomics dataset
curatedMetagenomicData_top12_species <- curatedMetagenomicData_top12_species %>%
mutate(Species = if_else(Species == "Lactobacillus casei_paracasei", "Lactobacillus casei", Species)) %>%
rename(tipcolor = clade_name) %>%
rename(species = Species)
# Plot Fig 1C
Fig1C <- tidytree %>%
left_join(nodes_clades) %>%
left_join(curatedMetagenomicData_top12_species) %>%
mutate(clade_name = ifelse(is.na(clade_name), "unclassified", clade_name)) %>%
mutate(tipcolor = ifelse(is.na(tipcolor), "unclassified", tipcolor)) %>%
mutate(color = ifelse(placement == "yes", clade_name, "other")) %>%
ggtree(layout = "circular", aes(col = color), size = 1, branch.length = "none") +
geom_tiplab2(aes(label = species, colour = tipcolor), size = 1.2, offset = 0.8) +
scale_color_manual(values = colors,
breaks = c("L. delbrueckii group",
"L. plantarum group",
"L. casei group",
"L. sakei group",
"L. algidus group",
"unclassified"),
name = "",
labels = c(expression(paste(italic("Lactobacillus delbrueckii")," group")),
expression(paste(italic("Lactobacillus plantarum")," group")),
expression(paste(italic("Lactobacillus casei")," group")),
expression(paste(italic("Lactobacillus sakei")," group")),
expression(paste(italic("Lactobacillus algidus")," group")),
"Unclassified")) +
theme(
legend.position = "none",
legend.text.align = 0
) +
ggplot2::xlim(0,28)
## Joining, by = "node"
## Joining, by = "species"
Fig1C
And then as a final figure we’ll be constructing 1D which shows how many times a sequence was placed in a certain clade.
# Figure out to which group the best node per ASV belongs
Fig1D <- asvs_best_node %>%
left_join(nodes_clades, by = c("best_node" = "node")) %>%
group_by(clade_name) %>%
add_count() %>%
select(clade_name, n) %>%
distinct() %>%
ungroup() %>%
ggplot(aes(x = reorder(clade_name, n), y = n, fill = clade_name)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = colors) +
# scale_y_continuous(breaks = c(0,2,4,6,8)) +
scale_x_discrete(breaks = c("L. delbrueckii group",
"L. plantarum group",
"L. casei group",
"L. sakei group",
"L. algidus group"
),
labels = c(expression(paste(italic("Lactobacillus delbrueckii")," group")),
expression(paste(italic("Lactobacillus plantarum/pentosus")," group")),
expression(paste(italic("Lactobacillus (para)casei/rhamnosus")," group")),
expression(paste(italic("Lactobacillus sakei")," group")),
expression(paste(italic("Lactobacillus algidus")," group"))))+
ylab("Count") +
xlab("") +
theme(
# Set text size
axis.title.y = element_text(size = 1,
margin = margin(0,0,0,0)),
axis.title.x = element_text(size = 10,
margin = margin(5,0,0,0)),
axis.text.x = element_text(size = 6,
colour="black",
margin = margin(2,0,0,0)),
axis.text.y = element_text(size = 6,
colour="black",
margin = margin(0,0,0,0),
face = "italic"),
# Configure lines and axes
axis.ticks.x = element_line(colour = "black"),
axis.ticks.y = element_blank(),
# Plot background
panel.background = element_rect(fill = "white"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
# Remove border
panel.border = element_rect(fill="transparent",color=NA),
# Remove legend
legend.position = "none",
# Align title
plot.title = element_text(hjust = 2.5)
) +
ggtitle("This study")
Fig1D
Now that we have these colours we can recolour figure 1E.
Fig1E <- Fig1E +
geom_blank(data = tibble(Species = "Lactobacillus crispatus", clade_name = "L. algidus group", count = 1)) +
scale_fill_manual(values = colors,
labels = c(expression(paste(italic("L. algidus")," group")),
expression(paste(italic("L. (para)casei/rhamnosus")," group")),
expression(paste(italic("L. delbrueckii")," group")),
expression(paste(italic("L. plantarum/pentosus")," group")),
expression(paste(italic("L. reuteri")," group")),
expression(paste(italic("L. sakei")," group"))
)) +
ggtitle("curatedMetagenomicData") +
theme(legend.text.align = 0,
legend.text = element_text(size = 6),
legend.key.size = unit(6, "pt"))
Fig1E
Finally the: the grand finale!
ggarrange(ggarrange(Fig1A,
Fig1B,
ncol = 2,
widths = c(1,2),
labels = c("a", "b")),
ggarrange(Fig1C,
ncol = 1,
labels = c("c")),
ggarrange(Fig1D,
Fig1E,
labels = c("d", "e"),
ncol = 2,
widths = c(1,2)
),
heights = c(1,2,1),
nrow = 3)
# Low quality
ggsave(filename = "Figure1.png",
width = 178,
height = 210,
units = "mm",
dpi = 300)
# Print quality
# ggsave(filename = "Figure1.tiff",
# width = 178,
# height = 210,
# units = "mm",
# dpi = 300)